In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from datetime import datetime
In [15]:
df = pd.read_csv("/Users/poojareddykolimi/Downloads/US_Accidents_March23.csv")
In [3]:
df.head()
Out[3]:
ID Source Severity Start_Time End_Time Start_Lat Start_Lng End_Lat End_Lng Distance(mi) ... Roundabout Station Stop Traffic_Calming Traffic_Signal Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight Astronomical_Twilight
0 A-1 Source2 3 2016-02-08 05:46:00 2016-02-08 11:00:00 39.865147 -84.058723 NaN NaN 0.01 ... False False False False False False Night Night Night Night
1 A-2 Source2 2 2016-02-08 06:07:59 2016-02-08 06:37:59 39.928059 -82.831184 NaN NaN 0.01 ... False False False False False False Night Night Night Day
2 A-3 Source2 2 2016-02-08 06:49:27 2016-02-08 07:19:27 39.063148 -84.032608 NaN NaN 0.01 ... False False False False True False Night Night Day Day
3 A-4 Source2 3 2016-02-08 07:23:34 2016-02-08 07:53:34 39.747753 -84.205582 NaN NaN 0.01 ... False False False False False False Night Day Day Day
4 A-5 Source2 2 2016-02-08 07:39:07 2016-02-08 08:09:07 39.627781 -84.188354 NaN NaN 0.01 ... False False False False True False Day Day Day Day

5 rows × 46 columns

Data Preprocessing¶

In [17]:
print("Shape:", df.shape)
print("Columns:", df.columns)
print(df.dtypes)
Shape: (7728394, 46)
Columns: Index(['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat',
       'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description',
       'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone',
       'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')
ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
End_Lat                  float64
End_Lng                  float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout                  bool
Station                     bool
Stop                        bool
Traffic_Calming             bool
Traffic_Signal              bool
Turning_Loop                bool
Sunrise_Sunset            object
Civil_Twilight            object
Nautical_Twilight         object
Astronomical_Twilight     object
dtype: object
In [18]:
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing, 'Percent': missing_percent})
print(missing_df[missing_df['Missing Values'] > 0].sort_values(by='Percent', ascending=False))
                       Missing Values    Percent
End_Lat                       3402762  44.029355
End_Lng                       3402762  44.029355
Precipitation(in)             2203586  28.512858
Wind_Chill(F)                 1999019  25.865904
Wind_Speed(mph)                571233   7.391355
Visibility(mi)                 177098   2.291524
Wind_Direction                 175206   2.267043
Humidity(%)                    174144   2.253301
Weather_Condition              173459   2.244438
Temperature(F)                 163853   2.120143
Pressure(in)                   140679   1.820288
Weather_Timestamp              120228   1.555666
Sunrise_Sunset                  23246   0.300787
Civil_Twilight                  23246   0.300787
Nautical_Twilight               23246   0.300787
Astronomical_Twilight           23246   0.300787
Airport_Code                    22635   0.292881
Street                          10869   0.140637
Timezone                         7808   0.101030
Zipcode                          1915   0.024779
City                              253   0.003274
Description                         5   0.000065
In [19]:
columns_to_drop = [
    'ID', 'Source', 'Description', 'Street',
    'End_Lat', 'End_Lng',
    'Wind_Chill(F)', 'Wind_Direction', 'Airport_Code',
    'Amenity', 'Turning_Loop',
    'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'
]

df.drop(columns=columns_to_drop, inplace=True)
In [20]:
df.columns
Out[20]:
Index(['Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng',
       'Distance(mi)', 'City', 'County', 'State', 'Zipcode', 'Country',
       'Timezone', 'Weather_Timestamp', 'Temperature(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Sunrise_Sunset'],
      dtype='object')
In [22]:
base_score = df['Severity'] * 10
env_score = (
    df['Temperature(F)'].apply(lambda x: 5 if (x < 32 or x > 100) else 0) +
    df['Humidity(%)'].apply(lambda x: 5 if x > 90 else 0) +
    df['Visibility(mi)'].apply(lambda x: 10 if x < 1 else (5 if x < 3 else 0)) +
    df['Wind_Speed(mph)'].apply(lambda x: 5 if x > 30 else 0) +
    df['Precipitation(in)'].apply(lambda x: 10 if x > 0.1 else 0)
)
infra_score = (
    df[['Junction', 'Crossing', 'Railway', 'Roundabout', 'Stop']].sum(axis=1) * 5
    - df[['Traffic_Calming', 'Traffic_Signal']].sum(axis=1) * 2
)
df['Hour'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce').dt.hour
time_score = df['Hour'].apply(lambda h: 5 if h < 6 or h > 20 else 0)
df['Risk_Score'] = base_score + env_score + infra_score + time_score
df['Risk_Score'] = base_score + env_score + infra_score + time_score
df['Risk_Score'] = df['Risk_Score'] / df['Risk_Score'].max() * 100

Exploratory Data Analysis¶

In [7]:
df['State'].value_counts().head(10).plot(kind='bar', title='Top 10 States by Accident Count')
plt.show()

df['City'].value_counts().head(10).plot(kind='bar', title='Top 10 Cities by Accident Count')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [8]:
import folium
from folium.plugins import HeatMap
from IPython.display import display

heat_df = df[['Start_Lat', 'Start_Lng']].dropna().sample(n=10000)
map_center = [heat_df['Start_Lat'].mean(), heat_df['Start_Lng'].mean()]


base_map = folium.Map(location=map_center, zoom_start=5)
HeatMap(heat_df.values.tolist()).add_to(base_map)

display(base_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [9]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'], format='mixed', errors='coerce')
In [10]:
df['Hour'] = df['Start_Time'].dt.hour

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.countplot(x='Hour', data=df, palette='viridis')
plt.title('Accidents by Hour of Day')
plt.xlabel('Hour')
plt.ylabel('Number of Accidents')
plt.tight_layout()
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1026542018.py:7: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Hour', data=df, palette='viridis')
No description has been provided for this image
In [11]:
df['Day'] = df['Start_Time'].dt.day_name()
plt.figure(figsize=(10, 6))
sns.countplot(x='Day', data=df, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
plt.title('Accidents by Day of Week')
Out[11]:
Text(0.5, 1.0, 'Accidents by Day of Week')
No description has been provided for this image
In [12]:
env_features = ['Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']
print(df[env_features].describe())
plt.figure(figsize=(10, 8))
sns.heatmap(df[['Severity', 'Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 
                'Wind_Speed(mph)', 'Precipitation(in)']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
       Temperature(F)   Humidity(%)  Visibility(mi)  Wind_Speed(mph)  \
count    7.564541e+06  7.554250e+06    7.551296e+06     7.157161e+06   
mean     6.166329e+01  6.483104e+01    9.090376e+00     7.685490e+00   
std      1.901365e+01  2.282097e+01    2.688316e+00     5.424983e+00   
min     -8.900000e+01  1.000000e+00    0.000000e+00     0.000000e+00   
25%      4.900000e+01  4.800000e+01    1.000000e+01     4.600000e+00   
50%      6.400000e+01  6.700000e+01    1.000000e+01     7.000000e+00   
75%      7.600000e+01  8.400000e+01    1.000000e+01     1.040000e+01   
max      2.070000e+02  1.000000e+02    1.400000e+02     1.087000e+03   

       Precipitation(in)  
count       5.524808e+06  
mean        8.407210e-03  
std         1.102246e-01  
min         0.000000e+00  
25%         0.000000e+00  
50%         0.000000e+00  
75%         0.000000e+00  
max         3.647000e+01  
Out[12]:
Text(0.5, 1.0, 'Correlation Matrix')
No description has been provided for this image

California Analysis¶

In [23]:
# Extract California data
df_ca = df[df['State'] == 'CA'].copy()
print(f"Number of accidents in California: {len(df_ca)}")
Number of accidents in California: 1741433
In [24]:
# Basic structure
print(df_ca.info())

# Missing values
missing_ca = df_ca.isnull().sum()
missing_percent = (missing_ca / len(df_ca)) * 100
missing_df_ca = pd.DataFrame({'Missing Values': missing_ca, 'Percent': missing_percent})
print(missing_df_ca[missing_df_ca['Missing Values'] > 0].sort_values(by='Percent', ascending=False))
<class 'pandas.core.frame.DataFrame'>
Index: 1741433 entries, 728 to 7728393
Data columns (total 34 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Start_Time         object 
 2   End_Time           object 
 3   Start_Lat          float64
 4   Start_Lng          float64
 5   Distance(mi)       float64
 6   City               object 
 7   County             object 
 8   State              object 
 9   Zipcode            object 
 10  Country            object 
 11  Timezone           object 
 12  Weather_Timestamp  object 
 13  Temperature(F)     float64
 14  Humidity(%)        float64
 15  Pressure(in)       float64
 16  Visibility(mi)     float64
 17  Wind_Speed(mph)    float64
 18  Precipitation(in)  float64
 19  Weather_Condition  object 
 20  Bump               bool   
 21  Crossing           bool   
 22  Give_Way           bool   
 23  Junction           bool   
 24  No_Exit            bool   
 25  Railway            bool   
 26  Roundabout         bool   
 27  Station            bool   
 28  Stop               bool   
 29  Traffic_Calming    bool   
 30  Traffic_Signal     bool   
 31  Sunrise_Sunset     object 
 32  Hour               int32  
 33  Risk_Score         float64
dtypes: bool(11), float64(10), int32(1), int64(1), object(11)
memory usage: 330.5+ MB
None
                   Missing Values    Percent
Precipitation(in)          566204  32.513683
Wind_Speed(mph)            162891   9.353848
Humidity(%)                 48341   2.775932
Temperature(F)              45969   2.639723
Visibility(mi)              40125   2.304137
Weather_Condition           39778   2.284211
Pressure(in)                37126   2.131922
Weather_Timestamp           32805   1.883793
Sunrise_Sunset               1343   0.077120
Zipcode                       597   0.034282
Timezone                      597   0.034282
City                           11   0.000632
In [15]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data=df_ca, x='Severity', palette='rocket')
plt.title('Accident Severity Distribution in California')
plt.xlabel('Severity Level')
plt.ylabel('Number of Accidents')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/3786709651.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_ca, x='Severity', palette='rocket')
No description has been provided for this image
In [25]:
df_ca['Start_Time'] = pd.to_datetime(df_ca['Start_Time'], errors='coerce')

# Hour of Day
df_ca['Hour'] = df_ca['Start_Time'].dt.hour
sns.countplot(x='Hour', data=df_ca, palette='viridis')
plt.title('Accidents by Hour (California)')
plt.xlabel('Hour of Day')
plt.ylabel('Accident Count')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/4055911724.py:5: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Hour', data=df_ca, palette='viridis')
No description has been provided for this image
In [12]:
df_ca['Day'] = df_ca['Start_Time'].dt.day_name()
sns.countplot(x='Day', data=df_ca, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], palette='pastel')
plt.title('Accidents by Day of the Week (California)')
plt.xticks(rotation=45)
plt.ylabel('Accident Count')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/620810245.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Day', data=df_ca, order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'], palette='pastel')
No description has been provided for this image
In [18]:
top_weather_ca = df_ca['Weather_Condition'].value_counts().nlargest(10).index
sns.countplot(y='Weather_Condition', data=df_ca[df_ca['Weather_Condition'].isin(top_weather_ca)],
              order=top_weather_ca, palette='cool')
plt.title('Top Weather Conditions in CA Accidents')
plt.xlabel('Accident Count')
plt.ylabel('Weather Condition')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1688842507.py:2: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(y='Weather_Condition', data=df_ca[df_ca['Weather_Condition'].isin(top_weather_ca)],
No description has been provided for this image
In [19]:
# Top 15 cities in CA
top_cities = df_ca['City'].value_counts().nlargest(15)
top_cities.plot(kind='barh', figsize=(10,6), color='skyblue')
plt.title('Top 15 Cities by Number of Accidents in CA')
plt.xlabel('Accident Count')
plt.gca().invert_yaxis()
plt.show()
No description has been provided for this image
In [26]:
# env_cols = ['Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Pressure(in)']
# print(df_ca[env_cols].describe())
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Convert Start_Time if not already
df_ca['Start_Time'] = pd.to_datetime(df_ca['Start_Time'], errors='coerce')

# Extract hour and define a new column
df_ca['Hour'] = df_ca['Start_Time'].dt.hour
df_ca['Sunset_Surge'] = df_ca['Hour'].apply(lambda x: 'After Sunset (6-9PM)' if 18 <= x <= 21 else
                                            ('Late Night' if 22 <= x <= 4 else
                                             'Rest of Day'))

# Countplot
sns.countplot(data=df_ca, x='Sunset_Surge', order=['After Sunset (6-9PM)', 'Late Night', 'Rest of Day'], palette='autumn')
plt.title('Accident Frequency During and After Sunset in CA')
plt.ylabel('Number of Accidents')
plt.xlabel('Time Category')
plt.show()
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_34131/4063657266.py:17: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df_ca, x='Sunset_Surge', order=['After Sunset (6-9PM)', 'Late Night', 'Rest of Day'], palette='autumn')
No description has been provided for this image
In [ ]:
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

# Filter for Los Angeles
df_la = df[(df['State'] == 'CA') & (df['City'].str.lower() == 'los angeles')]
coords = df_la[['Start_Lat', 'Start_Lng']].dropna().to_numpy()

# Optional: Sample for performance
coords_sample = coords if len(coords) < 10000 else coords[:10000]

# Haversine DBSCAN requires radians
from sklearn.preprocessing import StandardScaler
kms_per_radian = 6371.0088
epsilon = 1 / kms_per_radian  # 1 km radius

db = DBSCAN(eps=epsilon, min_samples=20, algorithm='ball_tree', metric='haversine').fit(np.radians(coords_sample))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
print(f"Number of clusters (hotspots) in LA: {num_clusters}")


df_clusters = pd.DataFrame(coords_sample, columns=['lat', 'lng'])
df_clusters['cluster'] = cluster_labels

# Plot clusters
plt.figure(figsize=(10, 8))
colors = plt.cm.get_cmap('tab10', num_clusters)
for cluster_num in range(num_clusters):
    cluster = df_clusters[df_clusters.cluster == cluster_num]
    plt.scatter(cluster['lng'], cluster['lat'], s=10, label=f'Cluster {cluster_num}', alpha=0.6)

# Noise points (label -1)
noise = df_clusters[df_clusters.cluster == -1]
plt.scatter(noise['lng'], noise['lat'], s=5, c='gray', alpha=0.3, label='Noise')

plt.title('Accident Hotspots in Los Angeles (DBSCAN Clusters)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Number of clusters (hotspots) in LA: 9
/var/folders/bm/jkk1ylls6z9_16zmffdppvwh0000gn/T/ipykernel_33694/1774305026.py:28: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead.
  colors = plt.cm.get_cmap('tab10', num_clusters)
No description has been provided for this image
In [61]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
from sklearn.cluster import DBSCAN
from IPython.display import display


# Filter for LA accidents
df_ca = df[(df['State'] == 'CA')]
coords = df_ca[['Start_Lat', 'Start_Lng']].dropna().sample(n=3000, random_state=42)

# DBSCAN clustering
kms_per_radian = 6371.0088
epsilon = 1.0 / kms_per_radian  # ~1km
db = DBSCAN(eps=epsilon, min_samples=20, algorithm='ball_tree', metric='haversine')
cluster_labels = db.fit_predict(np.radians(coords[['Start_Lat', 'Start_Lng']]))
coords['Cluster'] = cluster_labels

# Initialize map
la_center = [coords['Start_Lat'].mean(), coords['Start_Lng'].mean()]
folium_map = folium.Map(location=la_center, zoom_start=11, tiles='CartoDB positron')


marker_cluster = MarkerCluster().add_to(folium_map)


colors = ['red', 'blue', 'green', 'orange', 'purple', 'pink', 'brown', 'gray', 'cadetblue', 'darkred']

for idx, row in coords.iterrows():
    cluster_id = row['Cluster']
    color = colors[int(cluster_id) % len(colors)] if cluster_id != -1 else 'lightgray'
    folium.CircleMarker(
        location=(row['Start_Lat'], row['Start_Lng']),
        radius=3,
        color=color,
        fill=True,
        fill_opacity=0.7,
        popup=f"Cluster {int(cluster_id)}" if cluster_id != -1 else "Noise"
    ).add_to(marker_cluster)


display(folium_map)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]: